{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from requests_html import HTMLSession\n",
    "import requests_html\n",
    "import pandas as pd\n",
    "import urllib.parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/ztb/index.htm\")\n",
    "r.status_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_ztb.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_ztb.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element html at 0x2094e9ee0e0>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "parsed = requests_html.soup_parse(html_load)\n",
    "parsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/ztb/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/ztb/4aa14103a6d34d42837fa4325389300f.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/ea8754261f26419080ae1933f5ae7f2a.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/7226fe9acf3b4757b972599c1c947ffe.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/414b2db5e6c04f99be1096effc050fe1.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/60c660848ef44283bcae0b864f06245b.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/c1f45c4ed6d24523b8015716f1354c69.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/8de22fa69c5a4718a5d3a234157a231a.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/84df006147494c74a06300e42ba5fe0f.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/5b6a96bc894e4901b9015550f495d48c.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/cee6034ea34b4d37af132488e5b08eba.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/2b0efb94d7bc43a69cf7705d8e5eb3fb.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/4ca38f35a904483aa17f8149b6e74a5f.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/d9a43543bfc04b249605e362c4d56fde.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/a94be158ee2d45629fa34fe777c524aa.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/6c02c38297c94f82a0b00e0c465ecf42.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/f312609072284e918844133b2e8d17de.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/711839de4a50406da99b621c5c60f53a.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/ef39ea1df91b4208859c3d139614b752.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/303ff597654847ad9fe7ec28cee402b3.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/3c3b1ca74f0b47e1a6f9a6e2434014aa.htm']"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 重组链接\n",
    "list_URL  = [urllib.parse.urlunparse\\\n",
    "([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "for detail_url in parsed.xpath('//div[@class=\"news_title\"]/a/@href')]  \n",
    "list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>标题</th>\n",
       "      <th>链接</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目招标开标延期公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/4aa14103a6d34d42837...</td>\n",
       "      <td>2021-04-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目 招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/ea8754261f26419080a...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>中山大学南方学院数字电路基础实验室、电路与模拟电子实验室设备采购项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/7226fe9acf3b4757b97...</td>\n",
       "      <td>2021-03-31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>中山大学南方学院垃圾清运和处理服务项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/414b2db5e6c04f99be1...</td>\n",
       "      <td>2021-03-17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>中山大学南方学院2021年度维修、改造工程施工项目中标结果公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/60c660848ef44283bca...</td>\n",
       "      <td>2021-03-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>中山大学南方学院校舍家电采购项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/c1f45c4ed6d24523b80...</td>\n",
       "      <td>2021-03-05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>中山大学南方学院办公电脑采购项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/8de22fa69c5a4718a5d...</td>\n",
       "      <td>2021-03-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>中山大学南方学院2021年度维修、改造工程施工项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/84df006147494c74a06...</td>\n",
       "      <td>2021-01-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>中山大学南方学院2021年度维修、改造工程施工项目流标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/5b6a96bc894e4901b90...</td>\n",
       "      <td>2021-01-25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>中山大学南方学院2021年度维修、改造工程施工项目招标开标延期公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/cee6034ea34b4d37af1...</td>\n",
       "      <td>2021-01-05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>中山大学南方学院2021年度维修、改造工程施工项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/2b0efb94d7bc43a69cf...</td>\n",
       "      <td>2020-12-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>中山大学南方学院网络技术与媒体研发平台实验室设备采购项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/4ca38f35a904483aa17...</td>\n",
       "      <td>2020-11-18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>中山大学南方学院计算机组成原理实验箱采购项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/d9a43543bfc04b24960...</td>\n",
       "      <td>2020-11-13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>中山大学南方学院2020-2021办公电脑采购项目​招标开标延期公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/a94be158ee2d45629fa...</td>\n",
       "      <td>2020-09-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>中山大学南方学院2020-2021办公电脑采购项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/6c02c38297c94f82a0b...</td>\n",
       "      <td>2020-09-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>中山大学南方学院容灾自动备份系统采购项目 招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/f312609072284e91884...</td>\n",
       "      <td>2020-08-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>中山大学南方学院九期教工宿舍家具采购和安装项目中标结果公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/711839de4a50406da99...</td>\n",
       "      <td>2020-08-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>中山大学南方学院嵌入式创新实验室家具采购项目招标开标延期公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/ef39ea1df91b4208859...</td>\n",
       "      <td>2020-07-24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>中山大学南方学院跨境电商平台系统采购项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/303ff597654847ad9fe...</td>\n",
       "      <td>2020-07-24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>中山大学南方学院九期教工宿舍卫浴设施采购和安装项目招标开标延期公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/3c3b1ca74f0b47e1a6f...</td>\n",
       "      <td>2020-07-17</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        标题  \\\n",
       "0      广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目招标开标延期公告   \n",
       "1         广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目 招标公告   \n",
       "2   中山大学南方学院数字电路基础实验室、电路与模拟电子实验室设备采购项目招标公告   \n",
       "3                  中山大学南方学院垃圾清运和处理服务项目招标公告   \n",
       "4          中山大学南方学院2021年度维修、改造工程施工项目中标结果公示   \n",
       "5                     中山大学南方学院校舍家电采购项目招标公告   \n",
       "6                     中山大学南方学院办公电脑采购项目招标公告   \n",
       "7            中山大学南方学院2021年度维修、改造工程施工项目招标公告   \n",
       "8            中山大学南方学院2021年度维修、改造工程施工项目流标公告   \n",
       "9        中山大学南方学院2021年度维修、改造工程施工项目招标开标延期公告   \n",
       "10           中山大学南方学院2021年度维修、改造工程施工项目招标公告   \n",
       "11        中山大学南方学院网络技术与媒体研发平台实验室设备采购项目招标公告   \n",
       "12              中山大学南方学院计算机组成原理实验箱采购项目招标公告   \n",
       "13      中山大学南方学院2020-2021办公电脑采购项目​招标开标延期公告   \n",
       "14           中山大学南方学院2020-2021办公电脑采购项目招标公告   \n",
       "15               中山大学南方学院容灾自动备份系统采购项目 招标公告   \n",
       "16           中山大学南方学院九期教工宿舍家具采购和安装项目中标结果公示   \n",
       "17          中山大学南方学院嵌入式创新实验室家具采购项目招标开标延期公告   \n",
       "18                中山大学南方学院跨境电商平台系统采购项目招标公告   \n",
       "19       中山大学南方学院九期教工宿舍卫浴设施采购和安装项目招标开标延期公告   \n",
       "\n",
       "                                                   链接          日期  \n",
       "0   https://www.nfu.edu.cn/ztb/4aa14103a6d34d42837...  2021-04-08  \n",
       "1   https://www.nfu.edu.cn/ztb/ea8754261f26419080a...  2021-04-02  \n",
       "2   https://www.nfu.edu.cn/ztb/7226fe9acf3b4757b97...  2021-03-31  \n",
       "3   https://www.nfu.edu.cn/ztb/414b2db5e6c04f99be1...  2021-03-17  \n",
       "4   https://www.nfu.edu.cn/ztb/60c660848ef44283bca...  2021-03-11  \n",
       "5   https://www.nfu.edu.cn/ztb/c1f45c4ed6d24523b80...  2021-03-05  \n",
       "6   https://www.nfu.edu.cn/ztb/8de22fa69c5a4718a5d...  2021-03-03  \n",
       "7   https://www.nfu.edu.cn/ztb/84df006147494c74a06...  2021-01-26  \n",
       "8   https://www.nfu.edu.cn/ztb/5b6a96bc894e4901b90...  2021-01-25  \n",
       "9   https://www.nfu.edu.cn/ztb/cee6034ea34b4d37af1...  2021-01-05  \n",
       "10  https://www.nfu.edu.cn/ztb/2b0efb94d7bc43a69cf...  2020-12-11  \n",
       "11  https://www.nfu.edu.cn/ztb/4ca38f35a904483aa17...  2020-11-18  \n",
       "12  https://www.nfu.edu.cn/ztb/d9a43543bfc04b24960...  2020-11-13  \n",
       "13  https://www.nfu.edu.cn/ztb/a94be158ee2d45629fa...  2020-09-03  \n",
       "14  https://www.nfu.edu.cn/ztb/6c02c38297c94f82a0b...  2020-09-01  \n",
       "15  https://www.nfu.edu.cn/ztb/f312609072284e91884...  2020-08-15  \n",
       "16  https://www.nfu.edu.cn/ztb/711839de4a50406da99...  2020-08-14  \n",
       "17  https://www.nfu.edu.cn/ztb/ef39ea1df91b4208859...  2020-07-24  \n",
       "18  https://www.nfu.edu.cn/ztb/303ff597654847ad9fe...  2020-07-24  \n",
       "19  https://www.nfu.edu.cn/ztb/3c3b1ca74f0b47e1a6f...  2020-07-17  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 输出结果\n",
    "# B-D-1 pd.DataFrame 建构，pandas课有教\n",
    "df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath('//div[@class=\"news_title\"]/a/@title'),\n",
    "         \"链接\": list_URL,\n",
    "         \"日期\": parsed.xpath('//font[@class=\"right-more\"]/text()'),\n",
    "     } )\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_excel(\"data_out/nfu_ztb.xlsx\", sheet_name=\"检索结果\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "200\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n",
      "404\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/ztb/index'+str(i)+'.htm')\n",
    "    print(r.status_code)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/ztb/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/ztb/index1.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index2.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index3.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index4.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index5.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index6.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index7.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index8.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index9.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index10.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index11.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index12.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index13.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index14.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index15.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index16.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index17.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index18.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index19.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index20.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index21.htm']"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group = ['https://www.nfu.edu.cn/ztb/index'+str(i)+'.htm' for i in range(1,22)]\n",
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/ztb/index1.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index2.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index3.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index4.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index5.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index6.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index7.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index8.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index9.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index10.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index11.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index12.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index13.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index14.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index15.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index16.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index17.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index18.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index19.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index20.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index21.htm']"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group = ['https://www.nfu.edu.cn/ztb/index'+str(i)+'.htm' for i in range(1,22)]\n",
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "#     设置存档HTML的名称\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# xpath 准备：\n",
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index26.htm', 'index27.htm', 'index28.htm', 'index29.htm', 'index3.htm', 'index30.htm', 'index31.htm', 'index32.htm', 'index33.htm', 'index34.htm', 'index35.htm', 'index36.htm', 'index37.htm', 'index38.htm', 'index39.htm', 'index4.htm', 'index40.htm', 'index41.htm', 'index42.htm', 'index43.htm', 'index44.htm', 'index45.htm', 'index46.htm', 'index47.htm', 'index48.htm', 'index49.htm', 'index5.htm', 'index50.htm', 'index51.htm', 'index52.htm', 'index53.htm', 'index54.htm', 'index55.htm', 'index56.htm', 'index57.htm', 'index58.htm', 'index59.htm', 'index6.htm', 'index60.htm', 'index61.htm', 'index62.htm', 'index63.htm', 'index64.htm', 'index65.htm', 'index66.htm', 'index67.htm', 'index68.htm', 'index69.htm', 'index7.htm', 'index70.htm', 'index71.htm', 'index72.htm', 'index73.htm', 'index74.htm', 'index75.htm', 'index76.htm', 'index77.htm', 'index78.htm', 'index79.htm', 'index8.htm', 'index80.htm', 'index81.htm', 'index82.htm', 'index83.htm', 'index84.htm', 'index85.htm', 'index86.htm', 'index87.htm', 'index88.htm', 'index89.htm', 'index9.htm', 'index90.htm']\n"
     ]
    }
   ],
   "source": [
    "import os     # 读取html文件\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/ztb/')\n",
    "print(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index26.htm', 'index27.htm', 'index28.htm', 'index29.htm', 'index3.htm', 'index30.htm', 'index31.htm', 'index32.htm', 'index33.htm', 'index34.htm', 'index35.htm', 'index36.htm', 'index37.htm', 'index38.htm', 'index39.htm', 'index4.htm', 'index40.htm', 'index41.htm', 'index42.htm', 'index43.htm', 'index44.htm', 'index45.htm', 'index46.htm', 'index47.htm', 'index48.htm', 'index49.htm', 'index5.htm', 'index50.htm', 'index51.htm', 'index52.htm', 'index53.htm', 'index54.htm', 'index55.htm', 'index56.htm', 'index57.htm', 'index58.htm', 'index59.htm', 'index6.htm', 'index60.htm', 'index61.htm', 'index62.htm', 'index63.htm', 'index64.htm', 'index65.htm', 'index66.htm', 'index67.htm', 'index68.htm', 'index69.htm', 'index7.htm', 'index70.htm', 'index71.htm', 'index72.htm', 'index73.htm', 'index74.htm', 'index75.htm', 'index76.htm', 'index77.htm', 'index78.htm', 'index79.htm', 'index8.htm', 'index80.htm', 'index81.htm', 'index82.htm', 'index83.htm', 'index84.htm', 'index85.htm', 'index86.htm', 'index87.htm', 'index88.htm', 'index89.htm', 'index9.htm', 'index90.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>中山大学南方学院九期教工宿舍窗帘采购安装项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/bb5be88f7b7f424dab2...</td>\n",
       "      <td>2020-07-17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>中山大学南方学院嵌入式创新实验室家具采购项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/e71e4eac29194915ac3...</td>\n",
       "      <td>2020-07-17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>中山大学南方学院九期教工宿舍卫浴设施采购和安装项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/8ee35c7b7d094cbc84a...</td>\n",
       "      <td>2020-07-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>中山大学南方学院九期教工宿舍家具采购和安装项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/4627d138819f481c903...</td>\n",
       "      <td>2020-07-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>中山大学南方学院数字技术平台实验室电脑设备采购项目中标结果公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/9aa6d2284a084531966...</td>\n",
       "      <td>2020-06-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>269</th>\n",
       "      <td>9</td>\n",
       "      <td>中山大学南方学院学生体质健康测试仪采购项目招标公告（第二次）</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/ba01c43761e245d4937...</td>\n",
       "      <td>2015-03-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>10</td>\n",
       "      <td>中山大学南方学院计算机实验室设备采购项目中标公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/0020f85b9ef24d0792d...</td>\n",
       "      <td>2015-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>271</th>\n",
       "      <td>11</td>\n",
       "      <td>中山大学南方学院电气工程及自动化实验室设备采购项目招标公告（第二次）</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/10482a669fc54447aa2...</td>\n",
       "      <td>2015-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>272</th>\n",
       "      <td>12</td>\n",
       "      <td>中山大学南方学院音乐楼阶梯课室座椅采购项目中标公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/4e5e67a17b7d47cf8cc...</td>\n",
       "      <td>2015-03-20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>273</th>\n",
       "      <td>13</td>\n",
       "      <td>中山大学南方学院室内高尔夫模拟设备项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/35a1b4dab36a4ae5aa4...</td>\n",
       "      <td>2013-12-23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>414 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index                                  标题  \\\n",
       "0        0          中山大学南方学院九期教工宿舍窗帘采购安装项目招标公告   \n",
       "1        1          中山大学南方学院嵌入式创新实验室家具采购项目招标公告   \n",
       "2        2       中山大学南方学院九期教工宿舍卫浴设施采购和安装项目招标公告   \n",
       "3        3         中山大学南方学院九期教工宿舍家具采购和安装项目招标公告   \n",
       "4        4     中山大学南方学院数字技术平台实验室电脑设备采购项目中标结果公示   \n",
       "..     ...                                 ...   \n",
       "269      9      中山大学南方学院学生体质健康测试仪采购项目招标公告（第二次）   \n",
       "270     10            中山大学南方学院计算机实验室设备采购项目中标公示   \n",
       "271     11  中山大学南方学院电气工程及自动化实验室设备采购项目招标公告（第二次）   \n",
       "272     12           中山大学南方学院音乐楼阶梯课室座椅采购项目中标公示   \n",
       "273     13             中山大学南方学院室内高尔夫模拟设备项目招标公告   \n",
       "\n",
       "                                                    链结          日期  \n",
       "0    https://www.nfu.edu.cn/ztb/bb5be88f7b7f424dab2...  2020-07-17  \n",
       "1    https://www.nfu.edu.cn/ztb/e71e4eac29194915ac3...  2020-07-17  \n",
       "2    https://www.nfu.edu.cn/ztb/8ee35c7b7d094cbc84a...  2020-07-12  \n",
       "3    https://www.nfu.edu.cn/ztb/4627d138819f481c903...  2020-07-01  \n",
       "4    https://www.nfu.edu.cn/ztb/9aa6d2284a084531966...  2020-06-02  \n",
       "..                                                 ...         ...  \n",
       "269  https://www.nfu.edu.cn/ztb/ba01c43761e245d4937...  2015-03-27  \n",
       "270  https://www.nfu.edu.cn/ztb/0020f85b9ef24d0792d...  2015-03-26  \n",
       "271  https://www.nfu.edu.cn/ztb/10482a669fc54447aa2...  2015-03-26  \n",
       "272  https://www.nfu.edu.cn/ztb/4e5e67a17b7d47cf8cc...  2015-03-20  \n",
       "273  https://www.nfu.edu.cn/ztb/35a1b4dab36a4ae5aa4...  2013-12-23  \n",
       "\n",
       "[414 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "# os listdir取出某文件夹中的文件的名称组成列表\n",
    "files= os.listdir('html_out/ztb/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/ztb/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "\n",
    "# 凭借表格pandas 方法concat\n",
    "# 数据分析的方法 concat拼接DataFrame  sort_values排序\n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/nfu_招投标.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='招投标')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
